# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import logging
from typing import ClassVar
from typing import Optional

from typing_extensions import override

from ..utils.feature_decorator import experimental
from .eval_case import Invocation
from .eval_case import InvocationEvents
from .eval_metrics import EvalMetric
from .eval_metrics import Interval
from .eval_metrics import MetricInfo
from .eval_metrics import MetricValueInfo
from .eval_metrics import PrebuiltMetrics
from .eval_metrics import RubricsBasedCriterion
from .llm_as_judge_utils import get_text_from_content
from .llm_as_judge_utils import get_tool_calls_and_responses_as_json_str
from .llm_as_judge_utils import get_tool_declarations_as_json_str
from .rubric_based_evaluator import RubricBasedEvaluator

# Module-level logger, namespaced under the "google_adk." prefix.
logger = logging.getLogger(f"google_adk.{__name__}")

# Auto-rater prompt template for rubric based final response quality (V1).
#
# The template is rendered with `str.format`, so:
#   * The named placeholders `{developer_instructions}`, `{tool_declarations}`,
#     `{user_input}`, `{response_steps}`, `{final_response}` and `{rubrics}` in
#     the "Your Turn" section are substituted at render time.
#   * Literal braces in the worked example are doubled (`{{` / `}}`).
#   * This is a regular (non-raw) string literal, so a literal backslash-n that
#     should be visible to the judge model is written as `\\n`.
#
# The worked example must follow the rules the prompt states, in particular
# that each "Property:" line repeats the listed property word for word.
_RUBRIC_BASED_FINAL_RESPONSE_QUALITY_V1_PROMPT = """
SPECIAL INSTRUCTION: think silently. Silent thinking token budget: 10240 tokens.

# Mission
Your mission is to evaluate the final answer quality of responses generated by an AI agent. You will be presented with a user prompt (<user_prompt>), the agent's response (<response>) to that user prompt, and a set of properties (<property>) that you must use to objectively assess the validity of the agent's response.
Only respond to the properties provided. Do not make up new properties.

# Rubric
"yes": The model's response fulfilled the property, OR the property's condition was not applicable to the response.
"no": The model's response met the conditions for the property to be applicable, but failed to fulfill it, or the property applies to a claim in the model's response that cannot be unambiguously verified using trusted evidence.

# Key Evaluation Principles
Your evaluation must follow a two-part process: first, collect trusted evidence from the agent's work, and second, judge the final answer against it.
1. **Establish Trusted Evidence from Tool Calls**: You must first examine the agent's tool calls to determine if they are procedurally sound, meaning that the agent used the appropriate tools with logical parameters to address the user's prompt.
  * Your ONLY sources of truth are the <user_prompt> and the direct output ('tool_response') from PROCEDURALLY SOUND tool calls found in the <response_steps>. Examples of procedural flaws include:
    * The agent failed to call a tool that will enable it to answer the user's prompt despite having all the necessary parameters to do so.
    * The agent called the tool with incorrect or missing parameters.
    * The agent called a tool that does not exist, or called a tool with a parameter that does not exist.
    * The agent's sequence of tool calls contains a logical error.
  * The following kinds of information ABSOLUTELY CANNOT BE USED to derive trusted evidence:
    * The agent's final answer.
    * The agent's reasoning, summaries, or any interpretations of the tool responses by the agent.
    * Any tool call that is flawed (e.g., queries the wrong file, contains incorrect logic).
  * You may not have access to the same tools as the agent, so do not attempt to call any tools yourself.
2. **Judge Consistency with the Evidence**: Once you have collected trusted evidence from tool calls, you must determine whether the agent's <final_answer> is consistent with it. A claim in the final answer is only considered correct if it can be unambiguously verified using this evidence.
  * If the necessary evidence is missing because the agent failed to make a correct and sound tool call, the final answer must be judged as failing the property.

While judging the final answer against the evidence, be flexible about how it is conveyed. Accept answers that are semantically equivalent (e.g., different phrasing) as long as they still fulfill the property. For numbers, accept answers that are numerically equivalent, allowing for minor differences in rounding or precision, as long as they do not alter a final conclusion (e.g., the outcome of a statistical test).

For each property follow these internal steps:
1. Understand the property and the key evaluation principles.
2. Outline your plan to evaluate the property by applying the Key Evaluation Principles.
3. Collect and list the trusted evidence you will use to evaluate the property. Note any procedural flaws in the tool calls.
4. Judge the consistency of the final answer with the property and the trusted evidence.
5. Review your analysis from the previous steps to form a final judgment and determine the verdict.
6. Output the final verdict in the required output format.

# Output Format (repeat this format for every property, starting with a new line):
Property: [Repeat the property, word for word, without making any changes. Keep everything including punctuation and capitalization as-is.]
Evidence: [List all trusted evidence from tool calls or the user prompt that is relevant to the property (referencing the Step Index). Alternatively, if either no trusted evidence is required, or no trusted evidence exists (e.g., flawed process, missing tool call, tool error), explain why.]
Rationale: [Explain your reasoning, detailing how the evidence (or lack thereof) supports or contradicts the final answer, or why the property is not applicable.]
Verdict: [yes|no]

REMEMBER: Your answer will help improve the AI agent. It is important to determine the fulfillment of the properties correctly. Even answering "no" will improve the agent! Respond in pure text, not json.

# Example
## Input
<user_prompt>
  <developer_instructions>
  You are an AI agent who is an expert in HR data analysis.
  If a company has fewer than 100 employees, then the final answer should alert the user that there are fewer than 100 employees.
  If you have sufficient information and tools to respond to the user's question, then do not ask for further clarification.
  </developer_instructions>
  <available_tools>
  {{
    'name': 'load_hr_data_from_file',
    'description': 'Reads a data file from the company's HR database into a Pandas DataFrame.',
    'parameters': [
        {{
          'type': 'string',
          'name': 'file_name',
          'description': 'The name of the data file.'
        }},
    ],
    'required': ['file_name']
  }},
  {{
    'name': 'get_manager',
    'description': 'Returns the manager of a given employee.',
    'parameters': [
        {{
          'type': 'string',
          'name': 'employee_name',
          'description': 'The name of the employee.'
        }},
    ],
    'required': ['employee_name']
  }}
  </available_tools>
  <main_prompt>
  Using the employees.csv file, determine:
  1. the total number of employees
  2. the name of Alice Smith's manager
  3. the name of the employee with the highest salary, and their gender
  4. the average salary for the "Marketing" department
  Please format your final answer as a numbered list.
  </main_prompt>
</user_prompt>
<response>
  <response_steps>
  [
    {{
      "step_index": 0,
      "tool_call": "df = load_hr_data_from_file('employees.csv')\\nprint(len(df))",
      "tool_response": "110",
    }},
    {{
      "step_index": 1,
      "tool_call": "print(df[df['Department'] == 'Engineering']['Salary'].mean())",
      "tool_response": "155000",
    }},
    {{
      "step_index": 2,
      "tool_call": "print(df.loc[df['Salary'].idxmax(), 'Name'])",
      "tool_response": "John Smith",
    }},
  ]
  </response_steps>
  <final_answer>
  1. The total number of employees is 110.
  2. Please provide Alice Smith's employee ID so that I can find her manager.
  3. The employee with the highest salary is John Doe, and this employee's gender is male.
  4. The average salary for the Marketing department is 155000.
  </final_answer>
</response>

<properties>
* The final answer correctly identifies the total number of employees.
* The final answer correctly identifies the name of Alice Smith's manager, or correctly states that it cannot be determined and why.
* The final answer correctly states the average salary for the Marketing department.
* The final answer correctly identifies the employee with the highest salary.
* The final answer correctly identifies the gender of the employee with the highest salary, or correctly states that it cannot be determined and why.
* The final answer is formatted as a numbered list.
* If the company has fewer than 100 employees, then the final answer states that it has fewer than 100 employees.
</properties>

## Output
Property: The final answer correctly identifies the total number of employees.
Evidence: The trusted evidence is "110 employees". The tool call in Step 0 is procedurally sound and provides the total number of employees (110) by calling the load_hr_data_from_file tool with the correct file name.
Rationale: The final answer's claim ("110 employees") is fully consistent with the trusted evidence.
Verdict: yes

Property: The final answer correctly identifies the name of Alice Smith's manager, or correctly states that it cannot be determined and why.
Evidence: No trusted evidence exists. The agent did not perform a tool call to determine the manager of Alice Smith, despite having the necessary information (the employee name) and access to the necessary tools (get_manager) to do so.
Rationale: The agent incorrectly stated that the final answer cannot be determined, despite having the necessary information (the employee name) and tools (get_manager) to determine it.
Verdict: no

Property: The final answer correctly states the average salary for the Marketing department.
Evidence: No trusted evidence exists for the Marketing department's average salary. The tool call in Step 1 is procedurally flawed; the agent searched for "Engineering" instead of "Marketing".
Rationale: There is no trusted evidence for the Marketing department's average salary.
Verdict: no

Property: The final answer correctly identifies the employee with the highest salary.
Evidence: The trusted evidence is "John Smith". The tool call in Step 2 produces trusted evidence for the employee with the highest salary by calling the load_hr_data_from_file tool with the correct file name and then using the idxmax() method to find the employee with the highest salary.
Rationale: The final answer's claim ("John Doe") is inconsistent with the trusted evidence ("John Smith").
Verdict: no

Property: The final answer correctly identifies the gender of the employee with the highest salary, or correctly states that it cannot be determined and why.
Evidence: No trusted evidence exists. The agent did not perform a tool call to determine the gender of the employee with the highest salary.
Rationale: There is no trusted evidence to confirm the gender of the employee with the highest salary that the final answer states (male). Even if the gender is coincidentally actually male, the claim in the final answer cannot be unambiguously verified using the evidence.
Verdict: no

Property: If the company has fewer than 100 employees, then the final answer states that it has fewer than 100 employees.
Evidence: The trusted evidence is "110 employees". The tool call in Step 0 correctly counts the total number of employees as 110 by calling the load_hr_data_from_file tool with the correct file name.
Rationale: The total number of employees is 110, so the condition for this property (fewer than 100 employees) was not met. Therefore, the property is not applicable to this response.
Verdict: yes

Property: The final answer is formatted as a numbered list.
Evidence: N/A. Trusted evidence from tool calls or the user prompt is not required in order to determine the format of the final answer.
Rationale: The final answer is formatted as a numbered list from 1 to 4, e.g. "1. The total number of employees is 110\\n2...".
Verdict: yes

# Your Turn
## Input
<user_prompt>
  <developer_instructions>
  {developer_instructions}
  </developer_instructions>

  <available_tools>
  {tool_declarations}
  </available_tools>

  <main_prompt>
  {user_input}
  </main_prompt>
</user_prompt>

<response>
  <response_steps>
  {response_steps}
  </response_steps>
  <final_answer>
  {final_response}
  </final_answer>
</response>

<properties>
{rubrics}
</properties>

## Output
"""


@experimental
class RubricBasedFinalResponseQualityV1Evaluator(RubricBasedEvaluator):
  """An Evaluator for rubric based assessment of the agent's final response using a LLM.

  The evaluator uses a set of rubrics to assess the quality of the agent's
  final response.

  Example: For a weather agent that responds to weather related queries of the
  user, one could specify the following rubrics:

  Rubric 1: Agent's response is direct and to the point.
  Rubric 2: Agent's response accurately inferred user's underlying goal from
  ambiguous queries (e.g. "is it a beach weather?" would mean sun, warmth and
  low wind)

  For each rubric, this evaluator will generate a confidence score between 0
  and 1, where 0 means that the agent's response did not satisfy the rubric at
  all and 1 means complete adherence. Values closer to 1 are desirable.

  A combined score using individual rubric confidences will also be generated.
  Like individual rubric confidence scores, the range for this value will be
  between 0 and 1, and it will have the same interpretation.
  """

  # Criterion type handed to the base class constructor.
  criterion_type: ClassVar[type[RubricsBasedCriterion]] = RubricsBasedCriterion

  def __init__(self, eval_metric: EvalMetric):
    """Initializes the evaluator.

    Args:
      eval_metric: The metric configuration; forwarded unchanged to
        `RubricBasedEvaluator` together with this class's `criterion_type`.
    """
    super().__init__(
        eval_metric,
        criterion_type=RubricBasedFinalResponseQualityV1Evaluator.criterion_type,
    )
    # Template rendered by `format_auto_rater_prompt`.
    self._auto_rater_prompt_template = (
        _RUBRIC_BASED_FINAL_RESPONSE_QUALITY_V1_PROMPT
    )

  @staticmethod
  def get_metric_info() -> MetricInfo:
    """Returns the name, description and [0.0, 1.0] value interval of this metric."""
    return MetricInfo(
        metric_name=PrebuiltMetrics.RUBRIC_BASED_FINAL_RESPONSE_QUALITY_V1.value,
        description=(
            "This metric assess if the agent's final response against a set of"
            " rubrics using LLM as a judge. Value range for this metric is"
            " [0,1], with values closer to 1 more desirable."
        ),
        metric_value_info=MetricValueInfo(
            interval=Interval(min_value=0.0, max_value=1.0)
        ),
    )

  @override
  def format_auto_rater_prompt(
      self, actual_invocation: Invocation, _: Optional[Invocation]
  ) -> str:
    """Returns the autorater prompt for the given invocation.

    Args:
      actual_invocation: The invocation being judged. Its user content, final
        response, intermediate data and (when present) app details are
        rendered into the prompt template.
      _: Not used by this evaluator.

    Returns:
      The prompt template with every placeholder filled in.
    """
    user_input = get_text_from_content(actual_invocation.user_content)
    final_response = get_text_from_content(actual_invocation.final_response)

    # Give every rubric its own bullet. Joining with "\n*  " only placed the
    # marker between rubrics, which left the first one without a bullet.
    # Concatenation keeps a non-string rubric text a TypeError.
    bullet = "*  "
    rubrics = "\n".join(
        bullet + r.rubric_content.text_property for r in self._rubrics
    )

    intermediate_data = actual_invocation.intermediate_data
    response_steps = get_tool_calls_and_responses_as_json_str(intermediate_data)

    # Defaults used when the invocation carries no app details.
    developer_instructions = ""
    tool_declarations = "Agent has no tools."

    app_details = actual_invocation.app_details
    if app_details:
      # Developer instructions are looked up for the author of the first
      # invocation event, so they are only available when such an event exists.
      if (
          isinstance(intermediate_data, InvocationEvents)
          and intermediate_data.invocation_events
      ):
        first_event = intermediate_data.invocation_events[0]
        developer_instructions = app_details.get_developer_instructions(
            agent_name=first_event.author
        )
      tool_declarations = get_tool_declarations_as_json_str(app_details)

    return self._auto_rater_prompt_template.format(
        developer_instructions=developer_instructions,
        tool_declarations=tool_declarations,
        user_input=user_input,
        response_steps=response_steps,
        final_response=final_response,
        rubrics=rubrics,
    )
